{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "import re\n",
    "from collections import Counter\n",
    "from collections import defaultdict\n",
    "from itertools import groupby\n",
    "from pprint import pprint\n",
    "\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.rcParams['figure.dpi'] = 300 #分辨率\n",
    "plt.rcParams['figure.figsize'] = (6.0, 4.0) # 设置figure_size尺寸"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "ann_dir = \"../data/train_ann\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "for ann_fname in os.listdir(ann_dir):\n",
    "    ann_path = os.path.join(ann_dir, ann_fname)\n",
    "    with open(ann_path) as f:\n",
    "        reader = csv.reader(f, delimiter=\"\\t\")\n",
    "        for row in reader:\n",
    "            tag_info  = row[1]\n",
    "            tag_info = re.sub(\"\\d+;\\d+ \", \"\", tag_info)\n",
    "            tp, start, end = tag_info.split()\n",
    "            data.append((tp, int(start), int(end), row[-1]))\n",
    "            \n",
    "data.sort(key=lambda x:x[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<BarContainer object of 15 artists>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ4AAAD8CAYAAABHN8LqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xu8VXWd//HXW1BBQQkRx7zEaCaSF8SDdw0rLZ0m8RcJjJWIxTiljl00GxsvU/3KsbHRnMYI75GSosloeUvRNBUOChzUvNtkXhDvKKLAZ/74frcsj/vc917nHHg/H4/9OGt91309Hvjx+91rv5ciAjMzs7Ks090nYGZmaxcXHjMzK5ULj5mZlcqFx8zMSuXCY2ZmpXLhMTOzUrnwmJlZqVx4zMysVC48ZmZWqr7dfQI90ZAhQ2LYsGHdfRpmZr3KvHnzlkTEpm2t58JTxbBhw2hsbOzu0zAz61Uk/bk963mozczMSuXCY2ZmpXLhMTOzUrnwmJlZqVx4zMysVC48ZmZWKhceMzMrlQuPmZmVyj8greJ/lyzl2Gl3dvdpmJmV6rwv71vKcdzjMTOzUrnwmJlZqXr8UJukTYDf59m/AVYCL+T53SPi7XbuZzLw24h4rvZnaWZm7dXjC09EvAiMBJB0OrA0In7ciV1NBu4DXHjMzLpRjy88rZF0JPA1YD3gj8CxpOHDi0jFSsBU4Pk8P0PSMjrQUzIzs9rqtYVH0o7AYcDeEbFC0lRgAvA4MCQidsrrDYqIVyQdBxwbEfNb2N8UYArAgMGblXINZmZro978cMEngdFAo6T5wMeAbYHHgO0lnSvpU8Cr7dlZREyNiIaIaOg/cFDdTtrMbG3Xa3s8pGG0CyPiX9+3QNoZOJg0DPc5ck/GzMy6X2/u8dwCHC5pCKSn3yRtLWlTQBFxJXAqMCqv/zowsHtO1czMKnptjycimiSdAdwiaR3gHeAY0uPWF0gSEMC38yYXAdP8cIGZWffqVYUnIk5vNv8r4FdVVt21yra/Bn5dnzMzM7P26lWFpyxbDxlQWmaRmdnapjd/x2NmZr2QezxVOJ3abM3kkYyewT0eMzMrVV0Lj6SVkuZLekDSAknfzE+gIalB0rn1PL6ZmfU89R5qWxYRlYDPoaQn0DYCTouIRqCxzsc3M7MeprShtohYTEoQOFbJGEnXAUj6WO4ZzZd0v6SBuf1ESXMlLcy/2SG3/0bSvNyTmpLb+ki6WNIiSU2Svp7bt5V0Q17/D5KGl3XNZmb2fqU+XBART0jqAwxttuhbwNci4i5JA4C3JB0EbAfsTorHmSVp/4i4A5gcES9J6g/MlTQTGAZsERE7QgoHzfueChwTEY9K2gP4GfDx5ufmkFAzs3L0lKfa7gLOljQduDoins6F5yDg/rzOAFIhugM4XtJhuX2r3P4wsI2knwLXAzflIrY3cGUKMgBg/WonEBFTSUWKocOGR42vz8zMslILj6RtSJE2i4EdKu0R8SNJ1wOHAHflVGkBP4yInzfbxxhSMvVeEfGmpNlAv4h4WdIuwKdI0TmHAycAr1S+ZzIzs+5X2nc8ObzzfOC8iIhmy7aNiKaIOBOYCwwHbgQm514LkrbIDyhsDLyci85wYM+8fAiwTkTMBL4LjIqI14AnJX0+r6NcnMzMrJvUu8fTP78rZ11gBXAZcHaV9U6QdACwCngA+F1ELJe0A3B3HiZbCnwBuAE4RtJDpOG1e/I+tgAuqjyuDXwn/z0C+G9J383ncQWwoLaXaWZm7VXXwhMRfVpZNhuYnaePa2Gdc4Bzqiw6uIXdjmreEBFPAp9u41TNzKwkPeXhgh7FIaFmZvXjyBwzMyuVezxVOCTUejL3xq23c4/HzMxKVdPCI2mspOhKLI2kSZI+WMvzMjOznqPWPZ6JwJ35b2dNAlx4zMzWUDUrPPmHnvsCRwMTctsYSbMlXSXpT5KmK/8oR9KpOQB0kaSp+ced44AGYHoODO0v6RM5OLRJ0oWS1s/bPyXph3m9RkmjJN0o6XFJx+R1LpU0tnCO0yUdWqtrNjOzjqtlj+dQ4IaIeAR4UdJuuX1XUnTNCGAbYJ/cfl5EjM6hnv2Bz0TEVaRXJRyRY24CuBgYHxE7kR6G+KfCMf83r/eHvN44UpJBJcn6AlIPCkkbk3Lbrq/hNZuZWQfVsvBMJKUCkP9WhtvmRMTTEbEKmE9KkQY4QNK9kppIadEfrbLP7YEnczEDuATYv7B8Vv7bBNwbEa9HxAvAckmDIuJ2YLsc1zMRmBkRK6qdvKQpuefUuOz1Vzp46WZm1l41eZxa0mBS8dhJUgB9SL2V64HlhVVXAn0l9SO9nqAhIv4i6XSgXycOXdn3qmbHWcXqa7uUFLUzATiqpR05ndrMrBy16vGMAy6LiA9FxLCI2Ap4EtivhfUrRWZJ/m5oXGHZ68DAPP0wMEzSh/P8F4HbO3huF5OG+oiIBzu4rZmZ1VitCs9E4JpmbTNp4em2iHgF+AWwiJRCPbew+GLg/BwuKlIv5co8JLeKlHDdbhHxPPAQcFFHtjMzs/pQszcUrHEkbUD6DmhURLzanm2GDhseh393Wn1PzKyTnFxgPZWkeRHR0NZ6a3RygaRPkno7P21v0TEzs/pa43s8ndHQ0BCNjY3dfRpmZr1Ke3s8DgmtwiGh3c/DSWZrrjV6qM3MzHqeHtvjkbSS9FBA5bXZlwI/yT9ENTOzXqrHFh5gWY7DQdJQ4FfARsBpxZUk9W0pjcDMzHqeXjHUFhGLgSnAsTlMdJKkWZJuBX6fw0ivq6wv6TxJk/L0ITmgdJ6kc4vrmZlZ+XpF4QGIiCdIUTxDc9MoYFxEfKylbXI0z8+BgyNiN2DTup+omZm1qtcUnipujoiX2lhnOPBERDyZ5y9vaUWHhJqZlaPXFB5J25BCRhfnpjcKi1fw3mvpcOBoREyNiIaIaOg/cFDnT9TMzFrVKwpPfq3B+aR3+FT7xeufgRGS1pc0CPhEbn8Y2EbSsDw/vt7namZmrevJT7X1z0GhlcepLwPOrrZifrXCr0mho08C9+f2ZZK+Ctwg6Q3eG0ZqZmbdoMcWnojo08qyi0kp1sW2k4CTqqx+W0QMz6/c/i/SG07NzKyb9Iqhti76Su45PQBsTHrKzczMuolDQqtwSKiZWcc5JLQL1uaQUIdzmlm9rQ1DbWZm1oO48JiZWalceMzMrFStFh5Jm0ianz/PSfprYX699h5E0mRJf9PK8u9J+l6ztgZJTW3s905JI9t7HmZm1v1aLTwR8WJEjMyvJzif9D6ckfnzdgeOMxlosfCQMtSapwpMoJVsNTMz6506PdQm6UhJc3Lv52eS1pHUV9JlkpokLZJ0vKTxwEhgRks9pYh4EFgmabe8bwGfB67I81NzgOcDkk6tci59Jb1SmJ8gaVqe3kzS1Xn7OZL2bOF6HBJqZlaCThUeSTsChwF7595QX1IPZTdgSETsFBE7ApdGxAxgPjC+jZ7S5XkfAPsAz+ZXIQCcnJ8N3wU4UNKIDpzuucC/5+0PB6ZVW8khoWZm5ejs73g+CYwGGlPnhP7AX4Abge0lnQtcD9zUgX1eAcyWdBLvH2abKOnofL4fBEYAD3bgXLfP5wnwAUn9I2JZB87NzMxqpLOFR8CFEfGv71sg7QwcDHwN+BzpzaFtioinJD0D7EfqTTXk/W0H/DOwe0S8IumXvP+1B6vyOVUUlytv25HvpMzMrE46+x3PLcDhkobAu0+/bZ1fX6CIuBI4lfSWUIDXgYHt2O/lpKGxP0XEs7lto7z9a5I2Bz7VfKOIWAW8LGk7SeuQClfxXL9WmfFTcGZm3atTPZ6IaJJ0BnBL/g/9O8AxpBe1XZAfDgjg23mTi4BpkpbReu/j18BP8r4q7iMNq/2J9N6du1rY9tukob7FwDxg/dz+NeC/JR1Fut7bKBQiMzMrl0NCq3BIqJlZx7U3JNTJBWZmVqrS06klzQK2btb8rYi4pexzaUl3plM7HdrM1nSlF56I+GzZxzQzs56jy0Ntkk7JiQILczLBHrU4sSrHGSTpq/XYt5mZladLPR5JewGfAUZFxPL8eHW7w0M7aBDwVeBnddq/mZmVoKs9ns2BJRGxHCAilgDDJf2msoKkAyVdk6eXSjor95BukbS7pNmSnpD02bzOJEnX5vZHJZ2Wd/UjYNvcqzpLyVk5E64pZ8IhaYyk2/M+npD0I0lH5Jy2JknbdvGazcysC7paeG4CtpL0SA4K/RjpdzLD849JAY4CLszTGwK3RsRHST8K/T5wIOkHn/9W2O/upNSDnYHPS2oATgYez3lvJwL/jxQ+ugspFues/ANTctsxwA7AF4GPRMTupJy247p4zWZm1gVdKjwRsZQUDDoFeAGYARwJXAZ8QdIgYC/gd3mTt4Eb8nQTcHtEvJOnhxV2fXN+JcMy4Gqg2qNe+wKXR8TKiHgeuJ2UHwcwNyKezT2xx1mdGdf8OO9yOrWZWTm6/FRbRKwEZpMCPptIhecfgf8B3gKujIgVefV3YvUvVlcBlSG6VZKK59L8V60d/ZXr8sL0qsL8Klq45oiYCkwFGDpsuH9Va2ZWJ13q8UjaPod4VowE/hwRzwDPAN8lxeV01IGSBkvqD4wlxeQ0z3v7AzBeUp88rLc/MKcz12FmZuXpao9nAPDTPKS2AniM1WnU04FNI+KhTux3DjAT2BL4ZUQ0Aki6S9Ii0tDdSaRhvAWkHtFJEfGcpOFduSAzM6uvLhWeiJgH7N3C4n2BXzRbf0Bh+vSWlgFPR8TYKsf7h2ZNJ+ZPcZ3ZpKG/yvyYlpaZmVn56pJcIGke8AbwzXrs38zMei+nU1fhdGozs45rbzp16VltvUGZIaEOBTWztY1fi2BmZqUqrccjaSXpB5x9gSeBL0aEf6lpZraWKbPHsyzH3ewIvIRfP21mtlbqrqG2u4EtKjOSTpQ0N79a4YxC+28kzcuholNyWx9JFxfCQb+e20dKuifv4xpJH8jtsyWdmUNCH5G0X8nXamZmBaUXHkl9gE8As/L8QcB2pGDQkcBukvbPq0+OiN2ABuB4SZvkdbaIiB0jYidWJyNcCnw7InYmDelVUq0B+uaQ0BOatZuZWcnKLDz9Jc0HngM2A27O7Qflz/3AfcBwUiGCVGwWAPcAW+X2J4BtJP1U0qeB1yRtDAyKiNvzdpeQInQqrs5/5+GQUDOzblX6dzzAhwCx+jseAT/M3/+MjIgPR8QFksaQXnewV0TsQipM/SLiZdJrD2aTXn0wrR3HroSErqSVkNCIaIiIhv4DB3XuCs3MrE2lD7VFxJvA8cA3cyL1jcBkSQMAJG0haSiwMfByRLyZ89f2zMuHAOtExExSCOmoiHgVeLnw/c0XSa9JMDOzHqZbfkAaEfdLWghMjIjLJO0A3C0JYCnwBdJ7e46R9BDwMGm4DdJDCRdJqhTN7+S/RwLnS9qANBx3VDlXY2ZmHVFa4WkWAkpE/H1h+hzgnCqbHdzC7kZV2f98cq+oWfuYwvQSWviOx8zMyuHkAjMzK5Wz2qrYesgAZ6iZmdWJC08VHQkJdYEyM+sYD7WZmVmpelzhkbS0jvueJOm8eu3fzMza1uMKj5mZrdl6ReGRtKmkmTlIdK6kfSStI+kpSYMK6z0qabNq63fn+ZuZ2Wq9ovCQfuPzk4gYDXwOmBYRq4BrgcMAJO0B/Dkinq+2fvectpmZNddbnmr7JDAiJxsAbJQjdmYAp5ISqifk+dbWb1F+7cIUgAGDN6vpyZuZ2Wq9pfCsA+wZEW8VGyXdDXxY0qbAWOD7bazf4gEiYiowFWDosOFRu1M3M7Oi3jLUdhNwXGVG0kiAiAjgGuBs4KGIeLG19c3MrPv1xMKzgaSnC59vkNKsG/LbRR8kvQ6hYgYpVHRGoa219c3MrBv1uKG2iGipGI5vYf1G0jt9im1Lqq0fERcDF3ftDM3MrCt6Yo/HzMzWYD2ux9MTOCTUzKx+XHiqaCsk1EXJzKzzPNRmZmalcuExM7NSlVJ4JI2VFJKGl3G8Fs7hBEkbdNfxzcwsKavHMxG4M//tLicALjxmZt2s7oUnZ6TtCxxNylND0hhJt0u6VtITkn4k6QhJcyQ1Sdo2rzdM0q35h6C/l7R1br9Y0rjCMZYW9jtb0lWS/iRpupLjgQ8Ct0m6rd7XbGZmLSujx3MocENEPAK8KGm33L4LKVFgB+CLwEciYndSknQl7uanwCURsTMwHTi3HcfbldS7GQFsA+wTEecCzwAHRMQB1TaSNEVSo6TGZa+/0pnrNDOzdiij8EwErsjTV7B6uG1uRDwbEcuBx0n5agBNwLA8vRfwqzx9Gann1JY5EfF0fm3C/MK+WhURUyOiISIa+g8c1PYGZmbWKXX9HY+kwcDHgZ0kBdAHCOB6YHlh1VWF+VXtOK8V5KIpaR1gvcKy4n5XtmNfZmZWonr3eMYBl0XEhyJiWERsBTwJ7NfO7f9I/l4IOAL4Q55+CqgM2X0WWLcd+3odGNjO45qZWZ3Uu/BMJL22oGgm7X+67TjgKEkLSd8D/XNu/wXwMUkLSMNxb7RjX1OBG/xwgZlZ91J6pY0VDR02PA7/bstvy3ZkjpnZ+0maFxENba3n7z+qcEiomVn9ODLHzMxK5R5PFa2lU7snZGbWNe7xmJlZqdpVeCRtIml+/jwn6a+F+fXa3kOr+54s6W+6so+edBwzM2tdu4baIuJFYCSApNOBpRHx4+I6kkR6Sm5VB89hMnAf8FwHt+uoso5jZmat6NJQm6QPS3pQ0nTgAWBzSQdLulvSfZJmSNowr3uGpLmSFkk6P4d3jicVtBmV3pOkpyX9f0kL8vqjJN0k6XFJXykc++QcKrpQ0qmF81kk6QJJD0j6naR+1Y7Tles2M7POq8V3PMOBn0TECOAd4GTgExExCljI6h99nhMRo4GdgI2BT0fEDFKe2viIGBkRb+d1n4yIXYB7gAuAw4C9ge8BSDoE2BrYg1RQ9pa0d952e+A/I+KjwDJgbCvHMTOzktXiqbbHI6IxT+9NSoX+Yxp5Yz3Se3gAPiHpRKAfMASYB/yuhX3Oyn+bgL4R8QbwhqRV+TULBwEHA/fn9QYAHwEWA49FRFNun0c7Q0IlTQGmAAwYvFl7NjEzs06oReEpxtWI9AqELxZXyG/+PA8YFRF/lfR9UgFqSTEwtHmYaN98nO9HxAXNjvNhOhkSGhFTSbE6DB023HEOZmZ1UuvHqf9IylDbBkDShpK2A/qTisYSSQOBzxW26Ux4543A0YXvj7aUNKSNbRwSambWA9T0B6QR8byko0lf4le+wP+XiLhe0iXAg8CzwL2FzS4CpklaBuzezuP8VtJw4J48pPc68A9tbPae4/h7HjOz7uGQ0CpaCwl1coGZWXXtDQl1coGZmZXKWW1VOJ3azKx+XHiqcEiomVn9eKjNzMxKVdPCI2lljqR5IEfefFNSzY4haZKkDxbmp0kaUav9m5lZ/dV6qG1ZRFTCRIcCvwI2Ak5r7w4k9YmIlS0sngQsAp4BiIgvd+lszcysdHUbaouIxaQImmNzIOgkSedVlku6TtKYPL1U0n9IWgDsJenUQqDo1Lz9OKABmJ57Vf0lzZbUkPcxUVJT3ubMwnGWSvpB7oHdI8l5OGZm3aiu3/FExBNAH2BoG6tuCNwbEbtExJ3AeRExOiJ2JKUefCYirgIagSNy0OeyysZ5+O1M4OOk0NDRksYW9n1PDh29A/gKZmbWbXrKwwUrgZmF+QMk3SupiVRMPtrG9qOB2RHxQkSsAKYD++dlbwPX5ekWQ0MlTZHUKKlx2euvdPIyzMysLXUtPDmzbSUpNXpFs+MVQ0LfqnyvI6kf8DNgXETsBPyC1gNF2/JOrI5naDE0NCKmRkRDRDT0HzioC4czM7PW1K3wSNoUOJ80bBbAU8BISetI2oqWc9kqRWZJfgXCuMKyloI+55DCSYdI6gNMBG6vwWWYmVmN1fqptv6S5gPrkno4lwFn52V3AU+SgkIfIr2G+n0i4hVJvyA9vfYcMLew+GLg/Bz0uVdhm2clnQzcRnplwvURcW0Nr8vMzGrEIaFVOCTUzKzjHBJqZmY9krPaqnBIqJlZ/bjwVNFSSKiLkZlZ13mozczMSlVK4ZEUkn5ZmO8r6QVJ17Wx3UhJhxTmT5f0rS6cR5e2NzOzriurx/MGsKOk/nn+QOCv7dhuJHBIm2uZmVmvUeZQ22+Bv8vTE4HLKwskbSjpQklzJN0v6VBJ6wH/BozPoaDj8+ojcjjoE5KOL+zjGzkgdJGkEwrtp0h6RNKdwPZ1v0ozM2tVmYXnCmBCjsTZGbi3sOwU4NaI2B04ADiL9CPUU4EZORR0Rl53OPApUvLBaZLWlbQbcBSwB7An8BVJu+b2CazuOY2u90WamVnrSnuqLSIWShpG6u38ttnig4DPFr5/6Qds3cKuro+I5cBySYuBzYB9gWsi4g0ASVcD+5EK6zUR8WZun9XS+UmaQnqNAwMG+80JZmb1Uvbj1LOAHwNjgE0K7QI+FxEPF1eWtEeVfSwvTLcY+tlRETEVmAopuaAW+zQzs/cr+3HqC4EzIqKpWfuNwHGSBCBp19zeUihoc38AxkraQNKGwGG57Y7c3l/SQODva3ERZmbWeaUWnoh4OiLOrbLoe6TvdBZKeiDPQwr9HNHs4YJq+72PFCA6h/Td0bSIuD+3zwAWAL/jvYGjZmbWDRwSWkVLIaFOLjAza5lDQs3MrEdyVlsVDgk1M6sfF54qqoWEuhCZmdWGh9rMzKxULjxmZlaq0gtPzk57QNLC/Jj0HpKmSRpRZd1Jks5rY3+TctL1/MJnRF52Vj7WWZI2lXRvzoLbr17XZ2ZmrSv1Ox5JewGfAUZFxHJJQ4D1IuLLXdz1jIg4tkr7FGBwRKyUNAFoqsGxzMysC8ru8WwOLMlZa0TEkoh4JqdNNwBIOiqnSc8B9qlsmHssMyXNzZ99qh/i3fVnAQOAeZK+Dfw7cGjuEfVvbVszM6ufsp9quwk4VdIjwC2knsrtlYWSNgfOAHYDXiUlF9yfF58D/CQi7pS0NSlmZ4e8bLyk4mNne0XEZyUtjYiRed/PAw0t9IwcEmpmVpJSC09ELM2vKtiP9PqDGZJOLqyyBzA7Il4AkDQD+Ehe9klSfE5l3Y0kDcjTLQ21deTcHBJqZlaC0n/HExErgdnAbElNwJHt3HQdYM+IeKvYWChEZmbWC5T6HY+k7SVtV2gaCfy5MH8v8DFJm0haF/h8YdlNwHGFfY2s68mamVldlP1wwQDgEkkPSloIjABOryyMiGfz/N3AXcBDhW2PBxryY9gPAscUllVej1357F3n6zAzs05yOnUV1dKpHZljZta69qZTO6utCoeEmpnVjyNzzMysVO7xVNE8ndq9HzOz2nGPx8zMStXpwiNpS0nXSnpU0uOSzpG0Xi1PrsoxJ0n6YGG+ariomZn1XJ0qPEq/2rwa+E1EbEdKFxgA/KCrJySpTyuLJwHvFp6I+HJEPNjVY5qZWXk62+P5OPBWRFwE76YRfB2YLOmruSc0O/eGTqtsJOkLkubk39r8vFJkJC2V9B+SFgB7STo1B4EukjRVyTigAZheCfpsFi46UVJT3ubMwjGXSvqBpAWS7pHkIDYzs27U2cLzUWBesSEiXgP+l/TAwu7A54Cdgc9LapC0AzAe2CcHd64EjsibbwjcGxG7RMSdwHkRMToidgT6A5+JiKuARuCIiBgZEcsqx87Db2eSCuJIYLSksYV93xMRuwB3AF/p5DWbmVkN1Ouptpsj4kUASVcD+wIrSKnTc3O+Wn9gcV5/JTCzsP0Bkk4CNgAGAw8A/9PK8Ubz3nDR6cD+wG+At4Hr8nrzgAOr7cDp1GZm5ehs4XkQGFdskLQRsDWpwDSPQwhAwCUR8Z0q+3srD9chqR/wM9IrDP4i6XSgXyfPE+CdWB3PsJIWrtnp1GZm5ejsUNvvgQ0kfQnefSDgP4CLgTeBAyUNzi9cG0vKXfs9ME7S0LzNYEkfqrLvSpFZkl97UCxwrwMDq2wzhxQuOiSfy0Tg9irrmZlZN+tU4ck9iMNI3988CjwCvAX8S15lDmnobCEwMyIa89Nn3wVuygGhN5PeSNp8368AvwAWkV72Nrew+GLg/OZvEc3hoieTXhy3AJgXEdd25trMzKy+ah4SKmkSrbzpszdoHhLq5AIzs7a1NyTUyQVmZlYqvxahioaGhmhsbOzu0zAz61Xc4+mCSkhoMSjUzMxqw4XHzMxK1e7f8UhaCTQVmsZGxFM1PyMzM1ujdeQHpMty1E1VkvpGxIoanJOZma3BujTUll9TMEvSraQfiCLpxBzwuVDSGYV1T5H0iKQ7JV0u6Vu5vRj0OUTSU3m6j6SzCvv6x9w+Jm9zlaQ/SZqe07KRNFrSH3Mg6BxJAyXdIWlk4TzulLRLV67bzMw6ryM9nv6S5ufpJyPisDw9Ctg5Il6SdBCwHSkkVMAsSfsDbwATSAGefYH7aBYyWsXRwKsRMVrS+sBdkm7Ky3YlBZU+Q0pF2EfSHGAGMD4i5uYIn2XABaTXKZwg6SNAv4hY0IHrNjOzGqrFUNvNEfFSnj4of+7P8wNIhWggcE1EvAkgaVY7jncQsHN+HQLAxnlfbwNzIuLpvK/5wDDgVeDZiJgL76ZlI+lK4F8lnQhMJqUfvI9DQs3MylGLdOo3CtMCfhgRPy+uIOmEVrZfweohv2IYqIDjIuLGZvsaAywvNLUY/AkQEW9Kuhk4FDiclJBdbT2HhJqZlaDWj1PfSHoZ3AAASVvkUNA7gLH55W0Dgb8vbPMUq4vBuGb7+idJ6+Z9fUTShq0c+2Fgc0mj8/oDJVUK0jTgXGBuRLzcpSs0M7Muqen7eCLipvzCt7vz9/1LgS9ExH2SZpACPBfz3uDPHwO/zkNd1xfap5GG0O7LDw+8QEq6bunYb0saD/w0B4guAz4JLI2IeZJeAy6q0aWamVkndUtkTn7HztKI+HFJx/sgMBsYHhGr2lq/GBLqgFAzs/ZxZE6W3xl0L3BKe4qOmZnVl0NCq3BIqJlZx7nHY2ZmPZILj5mZlcqFx8zMSuXCY2ZmpXLhMTOzUrnwmJlZqVx4zMysVC48ZmZWKhceMzMrlZMLqpBaCs7CAAAEUElEQVT0Oint2qobAizp7pPooXxvWuf707refn8+FBGbtrVSTdOp1yAPtyf2YW0lqdH3pzrfm9b5/rRubbk/HmozM7NSufCYmVmpXHiqm9rdJ9DD+f60zPemdb4/rVsr7o8fLjAzs1K5x2NmZqVy4SmQ9GlJD0t6TNLJ3X0+9STpQkmLJS0qtA2WdLOkR/PfD+R2STo335eFkkYVtjkyr/+opCML7btJasrbnCtJ5V5h50naStJtkh6U9ICkf87tvj+ApH6S5khakO/PGbn9byXdm69phqT1cvv6ef6xvHxYYV/fye0PS/pUob1X/1uU1EfS/ZKuy/O+N0UR4U8abuwDPA5sA6wHLABGdPd51fF69wdGAYsKbf8OnJynTwbOzNOHAL8DBOwJ3JvbBwNP5L8fyNMfyMvm5HWVtz24u6+5A/dmc2BUnh4IPAKM8P159/4IGJCn1yW9Wn5P4NfAhNx+PvBPefqrwPl5egIwI0+PyP/O1gf+Nv/767Mm/FsEvgH8Crguz/veFD7u8ay2O/BYRDwREW8DVwCHdvM51U1E3AG81Kz5UOCSPH0JMLbQfmkk9wCDJG0OfAq4OSJeioiXgZuBT+dlG0XEPZH+FV1a2FePFxHPRsR9efp14CFgC3x/AMjXuTTPrps/AXwcuCq3N78/lft2FfCJ3MM7FLgiIpZHxJPAY6R/h73636KkLYG/A6bleeF78x4uPKttAfylMP90blubbBYRz+bp54DN8nRL96a19qertPc6eehjV9L/1fv+ZHkoaT6wmFRQHwdeiYgVeZXiNb17H/LyV4FN6Ph96y3+EzgJWJXnN8H35j1ceKyq/H/ia/Ujj5IGADOBEyLiteKytf3+RMTKiBgJbEn6v/Dh3XxKPYKkzwCLI2Jed59LT+bCs9pfga0K81vmtrXJ83kYiPx3cW5v6d601r5llfZeQ9K6pKIzPSKuzs2+P81ExCvAbcBepCHGSgxX8ZrevQ95+cbAi3T8vvUG+wCflfQUaRjs48A5+N68hwvPanOB7fLTJ+uRvuib1c3nVLZZQOXJqyOBawvtX8pPb+0JvJqHnG4EDpL0gfyE10HAjXnZa5L2zOPVXyrsq8fL53wB8FBEnF1Y5PsDSNpU0qA83R84kPQ92G3AuLxa8/tTuW/jgFtzj3EWMCE/2fW3wHakhy567b/FiPhORGwZEcNI531rRByB7817dffTDT3pQ3o66RHSePUp3X0+db7Wy4FngXdI48RHk8aWfw88CtwCDM7rCvivfF+agIbCfiaTvvh8DDiq0N4ALMrbnEf+sXJv+AD7kobRFgLz8+cQ3593z31n4P58fxYBp+b2bUj/cXwMuBJYP7f3y/OP5eXbFPZ1Sr4HD1N4sm9N+LcIjGH1U22+N4WPkwvMzKxUHmozM7NSufCYmVmpXHjMzKxULjxmZlYqFx4zMyuVC4+ZmZXKhcfMzErlwmNmZqX6P+0G7StLsq5BAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "class_count = {}\n",
    "for k, g in groupby(data, key=lambda x: x[0]):\n",
    "    entities = [item[-1] for item in g]\n",
    "    class_count[k] = len(entities)\n",
    "\n",
    "cnt_sort = sorted(class_count.items(), key=lambda x: x[1])\n",
    "names, counts = zip(*cnt_sort)\n",
    "\n",
    "plt.barh(range(len(counts)), counts,  height=0.9, color='steelblue', alpha=0.8, tick_label=names) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "实体类型与数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=================================\n",
      "[ Amount ]::\n",
      "[('<num>', 2651),\n",
      " ('<letter>', 2147),\n",
      " ('/', 464),\n",
      " ('.', 284),\n",
      " ('~', 182),\n",
      " ('量', 119),\n",
      " ('剂', 106),\n",
      " ('μ', 93),\n",
      " ('克', 48),\n",
      " ('毫', 38),\n",
      " ('%', 36),\n",
      " ('大', 34),\n",
      " ('小', 29),\n",
      " ('(', 27),\n",
      " (')', 25),\n",
      " ('单', 25),\n",
      " ('位', 24),\n",
      " ('高', 21),\n",
      " ('每', 19),\n",
      " ('次', 19),\n",
      " ('低', 18),\n",
      " ('片', 17),\n",
      " ('·', 17),\n",
      " ('、', 13),\n",
      " ('天', 13),\n",
      " ('重', 12),\n",
      " ('日', 11),\n",
      " ('千', 10),\n",
      " ('体', 10),\n",
      " ('∥', 9),\n",
      " ('起', 8),\n",
      " ('-', 8),\n",
      " ('粒', 8),\n",
      " ('始', 7),\n",
      " ('最', 7),\n",
      " ('中', 7),\n",
      " ('<', 7),\n",
      " ('±', 6),\n",
      " ('为', 6),\n",
      " ('较', 5),\n",
      " ('>', 4),\n",
      " ('周', 4),\n",
      " ('于', 4),\n",
      " (',', 4),\n",
      " ('增', 3),\n",
      " ('和', 3),\n",
      " ('滋', 3),\n",
      " ('≥', 3),\n",
      " ('至', 3),\n",
      " ('微', 3)]\n",
      "=================================\n",
      "[ Anatomy ]::\n",
      "[('<letter>', 37430),\n",
      " ('<num>', 8814),\n",
      " ('细', 4993),\n",
      " ('胞', 4970),\n",
      " ('-', 2981),\n",
      " ('胰', 1908),\n",
      " ('β', 1828),\n",
      " ('岛', 1546),\n",
      " ('素', 1395),\n",
      " ('骨', 1388),\n",
      " ('脂', 1279),\n",
      " ('肪', 1109),\n",
      " ('肾', 1077),\n",
      " ('脏', 1066),\n",
      " ('血', 908),\n",
      " ('管', 820),\n",
      " ('白', 779),\n",
      " ('腺', 779),\n",
      " ('组', 772),\n",
      " ('织', 739),\n",
      " ('体', 720),\n",
      " ('蛋', 678),\n",
      " ('皮', 670),\n",
      " ('内', 652),\n",
      " ('肝', 632),\n",
      " ('因', 592),\n",
      " ('基', 546),\n",
      " ('酶', 491),\n",
      " ('脉', 483),\n",
      " ('成', 464),\n",
      " ('肌', 456),\n",
      " ('质', 423),\n",
      " ('状', 402),\n",
      " ('小', 393),\n",
      " ('动', 390),\n",
      " ('子', 387),\n",
      " ('激', 377),\n",
      " ('膜', 370),\n",
      " ('肠', 363),\n",
      " ('酸', 349),\n",
      " ('心', 342),\n",
      " ('甲', 342),\n",
      " ('经', 341),\n",
      " ('神', 340),\n",
      " ('受', 332),\n",
      " ('糖', 316),\n",
      " ('化', 311),\n",
      " ('性', 310),\n",
      " ('系', 302),\n",
      " ('+', 250)]\n",
      "=================================\n",
      "[ Disease ]::\n",
      "[('<letter>', 25304),\n",
      " ('病', 12577),\n",
      " ('糖', 9815),\n",
      " ('尿', 9321),\n",
      " ('<num>', 4960),\n",
      " ('型', 3834),\n",
      " ('血', 3529),\n",
      " ('性', 2674),\n",
      " ('心', 1920),\n",
      " ('肾', 1693),\n",
      " ('症', 1653),\n",
      " ('肥', 1614),\n",
      " ('胖', 1600),\n",
      " ('甲', 1470),\n",
      " ('管', 1412),\n",
      " ('高', 1349),\n",
      " ('腺', 1258),\n",
      " ('疾', 1145),\n",
      " ('合', 982),\n",
      " ('征', 928),\n",
      " ('综', 918),\n",
      " ('代', 901),\n",
      " ('谢', 892),\n",
      " ('状', 859),\n",
      " ('能', 835),\n",
      " ('功', 826),\n",
      " ('骨', 790),\n",
      " ('胰', 787),\n",
      " ('素', 778),\n",
      " ('化', 745),\n",
      " ('变', 738),\n",
      " ('发', 726),\n",
      " ('脂', 723),\n",
      " ('脉', 716),\n",
      " ('动', 699),\n",
      " ('瘤', 682),\n",
      " ('压', 624),\n",
      " ('炎', 614),\n",
      " ('低', 613),\n",
      " ('癌', 609),\n",
      " ('岛', 580),\n",
      " ('减', 580),\n",
      " ('质', 572),\n",
      " ('死', 566),\n",
      " ('结', 544),\n",
      " ('冠', 530),\n",
      " ('肝', 524),\n",
      " ('硬', 517),\n",
      " ('常', 513),\n",
      " ('重', 500)]\n",
      "=================================\n",
      "[ Drug ]::\n",
      "[('<letter>', 9977),\n",
      " ('素', 3186),\n",
      " ('<num>', 2600),\n",
      " ('胰', 2035),\n",
      " ('岛', 1973),\n",
      " ('列', 1666),\n",
      " ('格', 1626),\n",
      " ('甲', 1298),\n",
      " ('剂', 1279),\n",
      " ('二', 1270),\n",
      " ('-', 1234),\n",
      " ('药', 1096),\n",
      " ('双', 1095),\n",
      " ('类', 1073),\n",
      " ('胍', 1059),\n",
      " ('物', 1049),\n",
      " ('糖', 984),\n",
      " ('汀', 970),\n",
      " ('脲', 776),\n",
      " ('利', 749),\n",
      " ('肽', 744),\n",
      " ('制', 659),\n",
      " ('抑', 615),\n",
      " ('拉', 493),\n",
      " ('激', 488),\n",
      " ('鲁', 464),\n",
      " ('磺', 457),\n",
      " ('他', 448),\n",
      " ('酮', 435),\n",
      " ('维', 380),\n",
      " ('降', 359),\n",
      " ('抗', 349),\n",
      " ('生', 333),\n",
      " ('阿', 306),\n",
      " ('沙', 292),\n",
      " ('体', 266),\n",
      " ('预', 252),\n",
      " ('混', 252),\n",
      " ('酸', 248),\n",
      " ('基', 247),\n",
      " ('唑', 239),\n",
      " ('脂', 239),\n",
      " ('特', 214),\n",
      " ('奈', 211),\n",
      " ('受', 206),\n",
      " ('似', 206),\n",
      " ('服', 205),\n",
      " ('口', 204),\n",
      " ('西', 190),\n",
      " ('那', 189)]\n",
      "=================================\n",
      "[ Duration ]::\n",
      "[('<num>', 1490),\n",
      " ('周', 288),\n",
      " ('<letter>', 230),\n",
      " ('月', 202),\n",
      " ('个', 200),\n",
      " ('年', 190),\n",
      " ('~', 100),\n",
      " ('期', 30),\n",
      " ('.', 30),\n",
      " ('天', 25),\n",
      " ('长', 21),\n",
      " ('以', 17),\n",
      " ('超', 16),\n",
      " ('过', 16),\n",
      " ('上', 15),\n",
      " ('余', 15),\n",
      " ('短', 12),\n",
      " ('程', 11),\n",
      " ('病', 10),\n",
      " ('续', 10),\n",
      " ('持', 9),\n",
      " ('数', 9),\n",
      " ('至', 9),\n",
      " ('后', 8),\n",
      " ('于', 8),\n",
      " ('<', 7),\n",
      " ('前', 7),\n",
      " ('内', 7),\n",
      " ('>', 6),\n",
      " ('时', 6),\n",
      " ('疗', 6),\n",
      " ('半', 6),\n",
      " ('间', 5),\n",
      " ('或', 5),\n",
      " ('少', 5),\n",
      " ('多', 5),\n",
      " ('治', 4),\n",
      " ('≥', 4),\n",
      " ('(', 4),\n",
      " (')', 4),\n",
      " ('大', 4),\n",
      " ('小', 4),\n",
      " ('每', 4),\n",
      " ('中', 3),\n",
      " ('位', 3),\n",
      " ('次', 3),\n",
      " ('±', 3),\n",
      " ('不', 3),\n",
      " ('达', 3),\n",
      " ('日', 3)]\n",
      "=================================\n",
      "[ Frequency ]::\n",
      "[('<num>', 432),\n",
      " ('<letter>', 285),\n",
      " ('次', 267),\n",
      " ('每', 223),\n",
      " ('日', 173),\n",
      " ('~', 52),\n",
      " ('/', 44),\n",
      " ('餐', 42),\n",
      " ('前', 41),\n",
      " ('周', 33),\n",
      " ('天', 31),\n",
      " ('一', 23),\n",
      " ('晚', 21),\n",
      " ('早', 18),\n",
      " ('或', 12),\n",
      " ('注', 7),\n",
      " ('射', 7),\n",
      " ('多', 6),\n",
      " ('月', 6),\n",
      " ('隔', 5),\n",
      " ('监', 5),\n",
      " ('测', 5),\n",
      " ('药', 5),\n",
      " ('睡', 5),\n",
      " ('午', 5),\n",
      " ('(', 4),\n",
      " ('年', 4),\n",
      " ('持', 4),\n",
      " ('续', 4),\n",
      " ('个', 4),\n",
      " ('给', 4),\n",
      " ('时', 4),\n",
      " ('后', 4),\n",
      " ('分', 4),\n",
      " (')', 3),\n",
      " ('至', 3),\n",
      " ('少', 3),\n",
      " ('期', 3),\n",
      " ('两', 3),\n",
      " ('服', 3),\n",
      " ('间', 3),\n",
      " ('、', 3),\n",
      " ('小', 3),\n",
      " ('钟', 3),\n",
      " ('性', 2),\n",
      " ('冲', 2),\n",
      " ('治', 2),\n",
      " ('疗', 2),\n",
      " ('长', 2),\n",
      " ('用', 2)]\n",
      "=================================\n",
      "[ Level ]::\n",
      "[('<num>', 600),\n",
      " ('重', 465),\n",
      " ('度', 355),\n",
      " ('<letter>', 290),\n",
      " ('期', 265),\n",
      " ('严', 248),\n",
      " ('中', 183),\n",
      " ('轻', 137),\n",
      " ('~', 118),\n",
      " ('级', 109),\n",
      " ('%', 74),\n",
      " ('量', 70),\n",
      " ('白', 65),\n",
      " ('性', 62),\n",
      " ('微', 54),\n",
      " ('.', 50),\n",
      " ('高', 44),\n",
      " ('缓', 40),\n",
      " ('显', 39),\n",
      " ('尿', 38),\n",
      " ('解', 38),\n",
      " ('急', 35),\n",
      " ('蛋', 34),\n",
      " ('低', 32),\n",
      " ('大', 30),\n",
      " ('等', 29),\n",
      " ('活', 29),\n",
      " ('动', 28),\n",
      " ('早', 27),\n",
      " ('降', 26),\n",
      " ('常', 26),\n",
      " ('加', 25),\n",
      " ('正', 23),\n",
      " ('症', 23),\n",
      " ('明', 22),\n",
      " ('强', 22),\n",
      " ('临', 21),\n",
      " ('床', 21),\n",
      " ('复', 21),\n",
      " ('、', 20),\n",
      " ('发', 20),\n",
      " ('危', 19),\n",
      " ('亚', 19),\n",
      " ('分', 18),\n",
      " ('慢', 17),\n",
      " ('增', 16),\n",
      " ('晚', 16),\n",
      " ('病', 15),\n",
      " ('著', 15),\n",
      " ('后', 14)]\n",
      "=================================\n",
      "[ Method ]::\n",
      "[('注', 327),\n",
      " ('射', 271),\n",
      " ('服', 200),\n",
      " ('口', 172),\n",
      " ('静', 140),\n",
      " ('餐', 137),\n",
      " ('脉', 126),\n",
      " ('<letter>', 116),\n",
      " ('下', 78),\n",
      " ('前', 75),\n",
      " ('皮', 75),\n",
      " ('用', 60),\n",
      " ('基', 55),\n",
      " ('础', 54),\n",
      " ('<num>', 49),\n",
      " ('腹', 49),\n",
      " ('腔', 49),\n",
      " ('时', 48),\n",
      " ('后', 37),\n",
      " ('联', 30),\n",
      " ('药', 29),\n",
      " ('泵', 28),\n",
      " ('早', 27),\n",
      " ('输', 26),\n",
      " ('滴', 26),\n",
      " ('合', 25),\n",
      " ('晚', 23),\n",
      " ('续', 21),\n",
      " ('即', 20),\n",
      " ('持', 19),\n",
      " ('冲', 17),\n",
      " ('睡', 15),\n",
      " ('内', 15),\n",
      " ('或', 13),\n",
      " ('立', 13),\n",
      " ('间', 12),\n",
      " ('量', 12),\n",
      " ('击', 12),\n",
      " ('疗', 12),\n",
      " ('三', 11),\n",
      " ('治', 11),\n",
      " ('给', 11),\n",
      " ('素', 11),\n",
      " ('次', 9),\n",
      " ('胰', 9),\n",
      " ('岛', 9),\n",
      " ('使', 9),\n",
      " ('单', 8),\n",
      " ('减', 8),\n",
      " ('+', 8)]\n",
      "=================================\n",
      "[ Operation ]::\n",
      "[('<letter>', 488),\n",
      " ('术', 344),\n",
      " ('手', 186),\n",
      " ('植', 113),\n",
      " ('移', 109),\n",
      " ('切', 79),\n",
      " ('除', 74),\n",
      " ('胃', 56),\n",
      " ('减', 54),\n",
      " ('器', 50),\n",
      " ('官', 50),\n",
      " ('腺', 50),\n",
      " ('状', 48),\n",
      " ('肾', 46),\n",
      " ('治', 42),\n",
      " ('疗', 42),\n",
      " ('旁', 35),\n",
      " ('重', 34),\n",
      " ('入', 32),\n",
      " ('路', 31),\n",
      " ('胰', 30),\n",
      " ('-', 30),\n",
      " ('全', 26),\n",
      " ('甲', 24),\n",
      " ('<num>', 24),\n",
      " ('介', 24),\n",
      " ('肢', 21),\n",
      " ('脉', 20),\n",
      " ('外', 19),\n",
      " ('经', 19),\n",
      " ('动', 18),\n",
      " ('科', 17),\n",
      " ('镜', 17),\n",
      " ('体', 17),\n",
      " ('侧', 17),\n",
      " ('截', 17),\n",
      " ('冠', 14),\n",
      " ('垂', 14),\n",
      " ('次', 13),\n",
      " ('部', 12),\n",
      " ('袖', 12),\n",
      " ('麻', 12),\n",
      " ('压', 12),\n",
      " ('肝', 11),\n",
      " ('肠', 11),\n",
      " ('腔', 11),\n",
      " ('肿', 11),\n",
      " ('内', 11),\n",
      " ('皮', 10),\n",
      " ('上', 10)]\n",
      "=================================\n",
      "[ Reason ]::\n",
      "[('<letter>', 1928),\n",
      " ('素', 677),\n",
      " ('胰', 634),\n",
      " ('岛', 609),\n",
      " ('血', 466),\n",
      " ('抗', 464),\n",
      " ('抵', 429),\n",
      " ('高', 372),\n",
      " ('<num>', 346),\n",
      " ('糖', 336),\n",
      " ('性', 303),\n",
      " ('脂', 245),\n",
      " ('化', 236),\n",
      " ('症', 226),\n",
      " ('激', 218),\n",
      " ('应', 203),\n",
      " ('能', 202),\n",
      " ('胞', 199),\n",
      " ('细', 197),\n",
      " ('功', 194),\n",
      " ('炎', 189),\n",
      " ('病', 188),\n",
      " ('的', 170),\n",
      " ('尿', 162),\n",
      " ('氧', 156),\n",
      " ('异', 138),\n",
      " ('常', 133),\n",
      " ('内', 130),\n",
      " ('肥', 124),\n",
      " ('胖', 121),\n",
      " ('增', 120),\n",
      " ('低', 115),\n",
      " ('-', 111),\n",
      " ('分', 110),\n",
      " ('体', 110),\n",
      " ('动', 108),\n",
      " ('管', 106),\n",
      " ('代', 105),\n",
      " ('因', 104),\n",
      " ('损', 102),\n",
      " ('生', 101),\n",
      " ('谢', 101),\n",
      " ('压', 100),\n",
      " ('缺', 93),\n",
      " ('重', 93),\n",
      " ('β', 92),\n",
      " ('肾', 87),\n",
      " ('质', 85),\n",
      " ('水', 84),\n",
      " ('平', 80)]\n",
      "=================================\n",
      "[ SideEff ]::\n",
      "[('血', 316),\n",
      " ('低', 270),\n",
      " ('糖', 266),\n",
      " ('症', 84),\n",
      " ('性', 75),\n",
      " ('心', 60),\n",
      " ('增', 57),\n",
      " ('重', 54),\n",
      " ('<letter>', 53),\n",
      " ('体', 52),\n",
      " ('加', 45),\n",
      " ('高', 41),\n",
      " ('钠', 41),\n",
      " ('应', 32),\n",
      " ('事', 31),\n",
      " ('件', 31),\n",
      " ('反', 30),\n",
      " ('道', 28),\n",
      " ('状', 26),\n",
      " ('肝', 25),\n",
      " ('痛', 22),\n",
      " ('管', 22),\n",
      " ('尿', 20),\n",
      " ('不', 19),\n",
      " ('升', 19),\n",
      " ('减', 19),\n",
      " ('毒', 19),\n",
      " ('损', 19),\n",
      " ('过', 18),\n",
      " ('发', 17),\n",
      " ('感', 17),\n",
      " ('病', 17),\n",
      " ('肾', 17),\n",
      " ('夜', 17),\n",
      " ('间', 17),\n",
      " ('细', 16),\n",
      " ('骨', 15),\n",
      " ('代', 15),\n",
      " ('谢', 15),\n",
      " ('肠', 15),\n",
      " ('胞', 15),\n",
      " ('死', 14),\n",
      " ('水', 14),\n",
      " ('腹', 14),\n",
      " ('中', 14),\n",
      " ('率', 13),\n",
      " ('功', 13),\n",
      " ('能', 13),\n",
      " ('胃', 13),\n",
      " ('良', 12)]\n",
      "=================================\n",
      "[ Symptom ]::\n",
      "[('血', 810),\n",
      " ('糖', 637),\n",
      " ('高', 434),\n",
      " ('低', 348),\n",
      " ('性', 306),\n",
      " ('<letter>', 289),\n",
      " ('白', 282),\n",
      " ('尿', 268),\n",
      " ('痛', 218),\n",
      " ('常', 202),\n",
      " ('素', 193),\n",
      " ('力', 184),\n",
      " ('发', 178),\n",
      " ('无', 178),\n",
      " ('异', 165),\n",
      " ('蛋', 164),\n",
      " ('状', 163),\n",
      " ('症', 161),\n",
      " ('胰', 150),\n",
      " ('增', 146),\n",
      " ('下', 146),\n",
      " ('心', 145),\n",
      " ('多', 145),\n",
      " ('肿', 143),\n",
      " ('动', 142),\n",
      " ('岛', 139),\n",
      " ('体', 133),\n",
      " ('不', 125),\n",
      " ('眼', 124),\n",
      " ('双', 122),\n",
      " ('能', 119),\n",
      " ('脂', 118),\n",
      " ('量', 112),\n",
      " ('骨', 109),\n",
      " ('经', 109),\n",
      " ('功', 105),\n",
      " ('大', 103),\n",
      " ('重', 103),\n",
      " ('水', 99),\n",
      " ('皮', 94),\n",
      " ('代', 93),\n",
      " ('肾', 93),\n",
      " ('损', 92),\n",
      " ('谢', 91),\n",
      " ('腹', 91),\n",
      " ('<num>', 89),\n",
      " ('减', 89),\n",
      " ('及', 89),\n",
      " ('反', 88),\n",
      " ('抗', 86)]\n",
      "=================================\n",
      "[ Test ]::\n",
      "[('<letter>', 71089),\n",
      " ('<num>', 8699),\n",
      " ('血', 7189),\n",
      " ('糖', 5349),\n",
      " ('-', 4819),\n",
      " ('素', 3869),\n",
      " ('体', 2740),\n",
      " ('脂', 2216),\n",
      " ('重', 2183),\n",
      " ('白', 1977),\n",
      " ('胰', 1758),\n",
      " ('岛', 1578),\n",
      " ('尿', 1575),\n",
      " ('蛋', 1575),\n",
      " ('度', 1365),\n",
      " ('压', 1356),\n",
      " ('腹', 1348),\n",
      " ('酸', 1337),\n",
      " ('空', 1271),\n",
      " ('数', 1255),\n",
      " ('醇', 1239),\n",
      " ('指', 1233),\n",
      " ('胆', 1090),\n",
      " ('能', 1080),\n",
      " ('固', 1059),\n",
      " ('功', 1052),\n",
      " ('密', 1034),\n",
      " ('骨', 945),\n",
      " ('腺', 834),\n",
      " ('甲', 780),\n",
      " ('状', 778),\n",
      " ('腰', 762),\n",
      " ('高', 728),\n",
      " ('量', 693),\n",
      " ('围', 676),\n",
      " ('肾', 665),\n",
      " ('细', 633),\n",
      " ('三', 616),\n",
      " ('胞', 603),\n",
      " ('甘', 599),\n",
      " ('β', 595),\n",
      " ('油', 590),\n",
      " ('激', 529),\n",
      " ('抗', 520),\n",
      " ('酶', 513),\n",
      " ('性', 513),\n",
      " ('联', 501),\n",
      " ('化', 498),\n",
      " ('收', 487),\n",
      " ('总', 482)]\n",
      "=================================\n",
      "[ Test_Value ]::\n",
      "[('<num>', 17182),\n",
      " ('<letter>', 14093),\n",
      " ('.', 4132),\n",
      " ('/', 2938),\n",
      " ('高', 1245),\n",
      " ('降', 876),\n",
      " ('%', 856),\n",
      " ('升', 830),\n",
      " ('低', 728),\n",
      " ('~', 704),\n",
      " ('显', 642),\n",
      " ('<', 605),\n",
      " ('±', 585),\n",
      " ('增', 533),\n",
      " (')', 508),\n",
      " ('≥', 506),\n",
      " ('(', 474),\n",
      " ('下', 449),\n",
      " ('明', 396),\n",
      " ('常', 378),\n",
      " ('性', 350),\n",
      " ('>', 335),\n",
      " ('正', 326),\n",
      " ('加', 256),\n",
      " ('著', 240),\n",
      " ('于', 235),\n",
      " ('μ', 179),\n",
      " ('减', 178),\n",
      " ('阳', 155),\n",
      " ('上', 146),\n",
      " ('≤', 128),\n",
      " ('少', 118),\n",
      " ('阴', 107),\n",
      " ('对', 107),\n",
      " ('不', 89),\n",
      " ('×', 82),\n",
      " ('度', 81),\n",
      " ('次', 81),\n",
      " ('大', 75),\n",
      " ('分', 64),\n",
      " ('小', 64),\n",
      " ('调', 63),\n",
      " ('变', 62),\n",
      " ('较', 61),\n",
      " ('水', 57),\n",
      " ('达', 57),\n",
      " ('见', 56),\n",
      " ('异', 56),\n",
      " ('平', 55),\n",
      " ('细', 53)]\n",
      "=================================\n",
      "[ Treatment ]::\n",
      "[('<letter>', 503),\n",
      " ('疗', 289),\n",
      " ('治', 239),\n",
      " ('食', 130),\n",
      " ('<num>', 116),\n",
      " ('动', 110),\n",
      " ('限', 100),\n",
      " ('干', 99),\n",
      " ('运', 99),\n",
      " ('细', 81),\n",
      " ('胞', 81),\n",
      " ('抗', 71),\n",
      " ('制', 61),\n",
      " ('液', 57),\n",
      " ('氧', 57),\n",
      " ('化', 51),\n",
      " ('血', 50),\n",
      " ('透', 50),\n",
      " ('阻', 48),\n",
      " ('有', 48),\n",
      " ('析', 47),\n",
      " ('放', 47),\n",
      " ('补', 46),\n",
      " ('力', 46),\n",
      " ('糖', 45),\n",
      " ('植', 43),\n",
      " ('移', 42),\n",
      " ('射', 42),\n",
      " ('素', 41),\n",
      " ('降', 39),\n",
      " ('低', 35),\n",
      " ('-', 33),\n",
      " ('饮', 32),\n",
      " ('量', 32),\n",
      " ('减', 31),\n",
      " ('方', 28),\n",
      " ('生', 27),\n",
      " ('法', 27),\n",
      " ('式', 26),\n",
      " ('体', 26),\n",
      " ('激', 26),\n",
      " ('预', 25),\n",
      " ('性', 25),\n",
      " ('质', 25),\n",
      " ('代', 24),\n",
      " ('间', 24),\n",
      " ('活', 23),\n",
      " ('免', 23),\n",
      " ('疫', 23),\n",
      " ('热', 23)]\n"
     ]
    }
   ],
   "source": [
    "from matplotlib.font_manager import *\n",
    "myfont = FontProperties(fname='/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc')\n",
    "\n",
    "#plt.rcParams['font.sans-serif']=['WenQuanYi Micro Hei Mono'] #用来正常显示中文标签\n",
    "plt.rcParams['axes.unicode_minus']=False  #用来正常显示负号\n",
    "char_count = {}\n",
    "for k, g in groupby(data, key=lambda x: x[0]):\n",
    "    entities = [item[-1] for item in g]\n",
    "    cnt = defaultdict(int)\n",
    "    for c in \"\".join(entities).replace(\" \", \"\"):\n",
    "        if c.isdigit():\n",
    "            cnt['<num>'] += 1\n",
    "        elif re.search(\"[a-zA-Z]\", c):\n",
    "            cnt[\"<letter>\"] += 1\n",
    "        else:\n",
    "            cnt[c] += 1\n",
    "    cnt_new = sorted(cnt.items(), key=lambda x: -x[1])[:50]\n",
    "    print(\"=================================\")\n",
    "    print(\"[\", k, \"]::\")\n",
    "    pprint(cnt_new)\n",
    "#     plt.bar(range(len(char)), count, tick_label=char)\n",
    "#     plt.xtick(char, fontproperties=myfont)\n",
    "#     break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('血', 6383), ('糖', 5889), ('胰', 5621), ('T', 4856), ('2', 4733), ('H', 3847), ('G', 2961), ('A', 2719), ('肾', 2525), ('F', 2489), ('体', 2362), ('高', 2335), ('1', 2325), ('骨', 2296), ('C', 2294), ('M', 2189), ('脂', 2155), ('甲', 2055), ('I', 1986), ('P', 1956), ('D', 1939), ('L', 1822), ('心', 1797), ('S', 1638), ('低', 1608), ('B', 1498), ('肥', 1396), ('空', 1378), ('尿', 1335), ('β', 1276), ('肝', 1172), ('二', 1150), ('N', 1042), ('R', 914), ('m', 863), ('腰', 821), ('代', 729), ('格', 666), ('内', 664), ('利', 642), ('3', 631), ('冠', 600), ('<', 583), ('甘', 565), ('升', 561), ('中', 560), ('降', 560), ('超', 556), ('维', 556), ('下', 534), ('餐', 521), ('5', 510), ('动', 494), ('≥', 493), ('口', 483), ('慢', 460), ('成', 452), ('增', 451), ('总', 450), ('收', 446), ('0', 445), ('4', 439), ('肌', 420), ('磺', 389), ('白', 370), ('舒', 368), ('基', 366), ('(', 363), ('结', 363), ('明', 358), ('V', 356), ('抗', 352), ('脑', 350), ('腹', 343), ('正', 339), ('O', 336), ('颈', 334), ('肿', 332), ('阿', 328), ('U', 327), ('他', 327), ('自', 326), ('7', 321), ('肠', 320), ('>', 315), ('皮', 314), ('p', 312), ('非', 307), ('严', 302), ('e', 301), ('微', 300), ('妊', 292), ('E', 284), ('垂', 283), ('身', 282), ('双', 280), ('大', 279), ('胆', 275), ('亚', 275), ('6', 273), ('炎', 270), ('葡', 266), ('神', 265), ('每', 260), ('显', 260), ('8', 256), ('静', 255), ('性', 247), ('多', 236), ('减', 235), ('细', 228), ('生', 226), ('死', 224), ('左', 223), ('W', 215), ('痛', 214), ('胃', 212), ('急', 211), ('无', 211), ('K', 209), ('外', 208), ('睡', 207), ('Z', 205), ('单', 203), ('全', 196), ('雌', 193), ('预', 189), ('s', 183), ('沙', 181), ('游', 180), ('西', 179), ('谷', 178), ('h', 176), ('早', 176), ('轻', 172), ('重', 167), ('视', 166), ('免', 165), ('眼', 162), ('线', 161), ('睾', 160), ('巨', 158), ('感', 156), ('9', 154), ('激', 154), ('三', 153), ('蛋', 151), ('前', 149), ('阳', 149), ('罗', 149), ('c', 147), ('促', 145), ('肺', 143), ('原', 141), ('氧', 138), ('活', 137), ('病', 134), ('足', 132), ('淋', 131), ('噻', 131), ('小', 125), ('酮', 125), ('胱', 125), ('阴', 122), ('艾', 121), ('夜', 119), ('抑', 117), ('不', 114), ('注', 114), ('右', 112), ('转', 112), ('恶', 111), ('周', 111), ('瑞', 111), ('胎', 109), ('赖', 109), ('破', 105), ('干', 105), ('股', 102), ('α', 101), ('长', 99), ('人', 95), ('乳', 95), ('上', 94), ('年', 94), ('臀', 93), ('磷', 93), ('趋', 93), ('缺', 93), ('胸', 92), ('催', 92), ('钙', 91), ('环', 90), ('追', 89), ('≤', 88), ('手', 88), ('出', 88), ('短', 87), ('吡', 86), ('持', 85), ('棕', 84), ('卒', 83), ('丙', 82), ('电', 82), ('系', 81), ('网', 81), ('晚', 81), ('放', 81), ('J', 80), ('黎', 80), ('瘦', 78), ('缬', 75), ('头', 74), ('卵', 72), ('精', 71), ('n', 71), ('平', 70), ('新', 70), ('雄', 70), ('发', 70), ('较', 68), ('营', 68), ('o', 67), ('有', 67), ('限', 67), ('第', 66), ('氯', 65), ('组', 64), ('器', 64), ('水', 62), ('过', 61), ('Y', 61), ('消', 60), ('腺', 60), ('黄', 60), ('先', 60), ('姜', 60), ('异', 59), ('核', 59), ('分', 59), ('乏', 57), ('关', 56), ('稳', 55), ('间', 55), ('溃', 55), ('达', 55), ('毛', 54), ('一', 54), ('终', 54), ('继', 54), ('链', 54), ('门', 53), ('孕', 53), ('护', 53), ('类', 52), ('纤', 51), ('雷', 51), ('脐', 50), ('儿', 50), ('贝', 50), ('饮', 50), ('强', 49), ('坐', 49), ('未', 49), ('常', 48), ('地', 48), ('吸', 47), ('q', 46), ('髋', 45), ('临', 45), ('铁', 45), ('海', 44), ('复', 44), ('卡', 44), ('随', 44), ('流', 43), ('i', 42), ('a', 42), ('主', 42), ('移', 42), ('透', 42), ('子', 41), ('γ', 41), ('极', 41), ('漆', 41), ('癌', 40), ('醛', 40), ('调', 39), ('日', 39), ('软', 39), ('嗜', 39), ('特', 39), ('Q', 39), ('逐', 39), ('月', 39), ('椎', 38), ('交', 38), ('桥', 38), ('速', 38), ('唑', 38), ('氢', 38), ('依', 38), ('补', 38), ('最', 37), ('t', 37), ('致', 37), ('斑', 37), ('运', 37), ('药', 37), ('安', 37), ('缓', 37), ('替', 37), ('脱', 36), ('胚', 35), ('突', 35), ('粥', 35), ('术', 35), ('疼', 35), ('踝', 34), ('髓', 34), ('u', 34), ('应', 33), ('红', 33), ('颅', 33), ('认', 33), ('那', 33), ('呼', 32), ('碱', 32), ('脊', 31), ('反', 31), ('硫', 31), ('d', 31), ('青', 31), ('克', 31), ('苯', 31), ('恩', 31), ('提', 31), ('远', 30), ('表', 30), ('对', 30), ('硒', 30), ('经', 30), ('女', 30), ('b', 29), ('肱', 29), ('造', 29), ('直', 29), ('遗', 29), ('乙', 28), ('胶', 28), ('可', 28), ('男', 28), ('影', 28), ('呕', 28), ('囊', 27), ('脉', 27), ('热', 27), ('局', 27), ('去', 27), ('盐', 27), ('进', 27), ('两', 26), ('库', 26), ('泌', 26), ('r', 26), ('钠', 26), ('l', 26), ('X', 26), ('排', 26), ('再', 26), ('功', 26), ('膳', 26), ('同', 25), ('产', 25), ('截', 25), ('羊', 25), ('v', 24), ('载', 24), ('凋', 24), ('四', 24), ('气', 24), ('绝', 24), ('假', 24), ('加', 24), ('脆', 23), ('普', 23), ('改', 23), ('联', 23), ('光', 23), ('食', 22), ('泡', 22), ('近', 22), ('初', 22), ('碳', 22), ('老', 22), ('贫', 22), ('别', 22), ('染', 21), ('后', 21), ('天', 21), ('胫', 21), ('含', 21), ('肉', 21), ('唾', 21), ('碘', 21), ('实', 21), ('膝', 21), ('充', 21), ('奥', 21), ('髂', 20), ('滤', 20), ('耳', 20), ('听', 20), ('混', 20), ('霉', 20), ('控', 20), ('氨', 19), ('黑', 19), ('酸', 19), ('酒', 19), ('息', 19), ('化', 19), ('闭', 19), ('g', 18), ('钾', 18), ('合', 18), ('房', 18), ('吲', 18), ('秋', 18), ('△', 18), ('半', 17), ('叉', 17), ('溶', 17), ('丝', 17), ('支', 17), ('螺', 17), ('质', 17), ('失', 17), ('狭', 17), ('家', 17), ('曲', 17), ('午', 17), ('穿', 17), ('介', 17), ('怕', 17), ('起', 16), ('各', 16), ('创', 16), ('醋', 16), ('边', 16), ('循', 15), ('膀', 15), ('回', 15), ('延', 15), ('角', 15), ('球', 15), ('硝', 15), ('顽', 15), ('能', 15), ('米', 15), ('选', 15), ('巴', 15), ('烟', 15), ('+', 15), ('偏', 15), ('估', 15), ('±', 15), ('附', 14), ('尾', 14), ('黏', 14), ('迷', 14), ('牙', 14), ('躯', 14), ('摄', 14), ('轴', 14), ('胞', 14), ('咽', 14), ('清', 14), ('相', 14), ('眶', 14), ('伏', 14), ('围', 14), ('金', 14), ('并', 14), ('氟', 14), ('脾', 13), ('靶', 13), ('锌', 13), ('密', 13), ('少', 13), ('智', 13), ('阻', 13), ('败', 13), ('宫', 13), ('优', 13), ('禁', 13), ('咳', 13), ('更', 13), ('呈', 13), ('磁', 13)]\n"
     ]
    }
   ],
   "source": [
    "# 特殊字符\n",
    "s1 = \"\".join([d[-1][0] for d in data])\n",
    "print(Counter(s).most_common(500))\n",
    "# symbol = tuple(set(re.findall(\"\\W\", s)))\n",
    "# print(symbol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('病', 10698), ('糖', 6044), ('素', 5847), ('胞', 4748), ('C', 4412), ('L', 2621), ('1', 2584), ('G', 2524), ('R', 2432), ('S', 2233), ('A', 2119), ('重', 2103), ('压', 2002), ('D', 1977), ('症', 1753), ('H', 1732), ('胖', 1722), ('M', 1688), ('4', 1582), ('T', 1545), ('高', 1515), ('I', 1496), ('醇', 1386), ('酸', 1332), ('剂', 1276), ('数', 1231), ('2', 1207), ('物', 1172), ('度', 1076), ('3', 1065), ('白', 1060), ('P', 1055), ('常', 1050), ('抗', 1035), ('胍', 1031), ('能', 989), ('征', 960), ('肽', 933), ('脏', 843), ('化', 829), ('汀', 826), ('%', 818), ('N', 818), ('s', 816), ('变', 806), ('体', 764), ('g', 738), ('织', 720), ('酮', 714), ('酶', 701), ('围', 684), ('瘤', 661), ('低', 647), ('量', 646), ('性', 633), ('6', 618), ('癌', 578), ('期', 571), ('β', 527), ('酯', 526), ('l', 523), ('5', 507), ('0', 495), ('平', 490), ('炎', 483), ('d', 477), ('B', 476), ('腺', 474), ('n', 450), ('管', 449), ('率', 437), ('K', 433), ('降', 425), (')', 413), ('b', 400), ('脂', 398), ('脉', 395), ('术', 390), ('减', 387), ('松', 372), ('亡', 370), ('亢', 363), ('加', 362), ('子', 359), ('因', 324), ('疗', 324), ('α', 321), ('脲', 320), ('损', 319), ('a', 311), ('尿', 311), ('药', 307), ('岛', 306), ('比', 305), ('乱', 301), ('m', 296), ('骨', 296), ('酐', 295), ('类', 294), ('周', 293), ('E', 288), ('射', 288), ('O', 286), ('件', 286), ('全', 285), ('t', 283), ('肪', 282), ('折', 279), ('碍', 279), ('染', 279), ('h', 252), ('肝', 252), ('次', 248), ('F', 241), ('验', 241), ('质', 240), ('7', 233), ('伤', 233), ('痛', 228), ('应', 220), ('积', 218), ('道', 215), ('中', 209), ('退', 209), ('血', 207), ('动', 203), ('风', 198), ('经', 197), ('值', 195), ('月', 193), ('竭', 184), ('死', 177), ('9', 176), ('服', 175), ('年', 173), ('长', 173), ('少', 169), ('激', 169), ('U', 166), ('钙', 162), ('X', 160), ('力', 159), ('钠', 158), ('足', 155), ('查', 155), ('8', 152), ('肉', 152), ('石', 150), ('奈', 148), ('片', 147), ('肿', 147), ('标', 144), ('肌', 143), ('膜', 142), ('害', 139), ('植', 139), ('声', 137), ('节', 135), ('食', 135), ('毒', 133), ('油', 133), ('统', 131), ('i', 130), ('特', 130), ('于', 130), ('乏', 126), ('V', 119), ('级', 117), ('嗪', 117), ('坦', 117), ('象', 112), ('位', 111), ('钾', 111), ('分', 111), ('大', 108), ('间', 103), ('法', 102), ('疡', 101), ('龄', 100), ('龙', 100), ('液', 98), ('原', 97), ('γ', 97), ('氮', 97), ('部', 94), ('x', 94), ('测', 94), ('谢', 93), ('多', 92), ('状', 90), ('骼', 89), ('路', 89), ('磷', 89), ('调', 88), ('块', 85), ('产', 82), ('镜', 81), ('像', 80), ('脑', 78), ('用', 78), ('前', 78), ('析', 78), ('窄', 77), ('良', 76), ('球', 75), ('润', 75), ('态', 74), ('成', 73), ('塞', 73), ('肾', 72), ('皮', 72), ('失', 72), ('肠', 71), ('理', 71), ('型', 70), ('颈', 69), ('轴', 69), ('衰', 69), ('谱', 66), ('生', 65), ('郁', 65), ('超', 65), ('元', 64), ('程', 63), ('差', 63), ('达', 62), ('厚', 61), ('注', 61), ('升', 60), ('胺', 60), ('形', 60), ('唑', 60), ('c', 59), ('下', 58), ('泌', 57), ('司', 56), ('净', 56), ('盐', 55), ('影', 55), ('环', 53), ('心', 53), ('热', 53), ('检', 52), ('e', 51), ('氧', 51), ('儿', 51), ('史', 51), ('图', 51), ('p', 50), ('镁', 50), ('碘', 50), ('尔', 50), ('Z', 50), ('进', 49), ('r', 47), ('轻', 47), ('肢', 46), ('时', 46), ('小', 45), ('+', 45), ('核', 44), ('陷', 44), ('醛', 43), ('制', 43), ('Y', 42), ('水', 42), ('天', 41), ('区', 41), ('碱', 41), ('菌', 41), ('显', 40), ('髋', 39), ('巢', 39), ('径', 39), ('上', 39), ('胀', 39), ('利', 39), ('发', 38), ('缩', 38), ('清', 37), ('础', 37), ('泻', 37), ('林', 37), ('布', 37), ('倍', 37), ('椎', 36), ('泵', 36), ('限', 36), ('胃', 35), ('烟', 35), ('胎', 34), ('网', 34), ('眼', 34), ('聋', 34), ('吐', 34), ('克', 33), ('叶', 33), ('≤', 33), ('尼', 33), ('解', 32), ('规', 32), ('支', 31), ('囊', 31), ('泡', 31), ('局', 31), ('定', 31), ('群', 30), ('适', 30), ('娠', 29), ('险', 29), ('出', 29), ('律', 29), ('饮', 29), ('音', 29), ('色', 29), ('作', 28), ('髓', 28), ('层', 28), ('入', 28), ('别', 28), ('学', 28), ('呤', 27), ('板', 27), ('族', 27), ('强', 27), ('悸', 27), ('瘦', 27), ('粒', 26), ('合', 26), ('干', 26), ('感', 26), ('后', 26), ('盘', 25), ('壁', 25), ('肺', 25), ('塑', 25), ('衡', 25), ('齐', 25), ('℃', 25), ('滴', 24), ('弱', 24), ('式', 24), ('留', 24), ('腔', 23), ('肤', 23), ('颤', 23), ('簇', 22), ('酒', 22), ('善', 22), ('预', 22), ('荷', 22), ('养', 21), ('张', 21), ('慢', 21), ('快', 21), ('难', 21), ('例', 21), ('温', 21), ('构', 20), ('啶', 20), ('活', 20), ('者', 20), ('缓', 20), ('疮', 20), ('疹', 20), ('除', 20), ('岁', 20), ('搏', 20), ('组', 19), ('结', 19), ('段', 19), ('功', 19), ('着', 19), ('育', 19), ('迫', 19), ('硒', 19), ('视', 19), ('木', 19), ('描', 19), ('丸', 18), ('卵', 18), ('床', 18), ('痫', 18), ('移', 18), ('W', 18), ('氯', 18), ('日', 18), ('粗', 18), ('汗', 18), ('缘', 17), ('的', 17), ('疫', 17), ('他', 17), ('著', 17), ('取', 17), ('吸', 17), ('点', 16), ('迷', 16), ('现', 16), ('况', 16), ('梗', 16), ('搐', 16), ('坏', 16), ('等', 16), ('余', 16), ('冶', 15), ('毛', 15), ('o', 15), ('链', 15), ('危', 15), ('孕', 15), ('疸', 15), ('觉', 15), ('刺', 15), ('迹', 15), ('灶', 14), ('苷', 14), ('破', 14), ('连', 14), ('晕', 14), ('振', 14), ('宫', 13), ('障', 13), ('内', 13), ('带', 13), ('通', 13), ('速', 13), ('裂', 13), ('黄', 13), ('丁', 13), ('佳', 13), ('闷', 13), ('匀', 13), ('号', 12), ('室', 12), ('侧', 12), ('斑', 12), ('阻', 12), ('熟', 12), ('迟', 12), ('宽', 12), ('乐', 12), ('击', 12), ('微', 12), ('睡', 12), ('项', 12), ('端', 11), ('线', 11), ('器', 11), ('维', 11), ('f', 11), ('收', 11), ('底', 11), ('明', 11), ('疽', 11), ('剥', 11), ('传', 11), ('异', 11), ('米', 11), ('断', 11), ('短', 11), ('说', 11), ('嗽', 11), ('q', 10), ('臂', 10), ('k', 10), ('栓', 10), ('累', 10), ('呆', 10), ('整', 10), ('乳', 10), ('糊', 10), ('在', 10), ('嵴', 9), ('膈', 9), ('细', 9), ('u', 9), ('阴', 9), ('流', 9), ('境', 9), ('残', 9), ('饰', 9), ('如', 9), ('芬', 9), ('治', 9), ('复', 9), ('潮', 9), ('促', 9), ('光', 9), ('软', 9), ('梁', 8), ('窦', 8), ('浆', 8), ('剩', 8), ('受', 8), ('喘', 8), ('滞', 8), ('溃', 8), ('均', 8), ('宁', 8)]\n"
     ]
    }
   ],
   "source": [
    "s2 = \"\".join([d[-1][-1] for d in data])\n",
    "print(Counter(s2).most_common(500))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
